source("ingest_data.R")
# Inspect the factor levels for repositories and for committer/author teams.
levels(data[["repo"]])
## [1] "IntTest" "Jupiter" "Mars"    "Mercury" "Neptune" "Saturn"  "Uranus" 
## [8] "Venus"
levels(data[["committerteam"]])
##  [1] "Arch"    "Blue"    "Brown"   "Green"   "Orange"  "Pink"    "Red"    
##  [8] "UI"      "Unknown" "Violet"  "Yellow"
levels(data[["authorteam"]])
##  [1] "Arch"    "Blue"    "Brown"   "Green"   "Orange"  "Pink"    "Red"    
##  [8] "UI"      "Unknown" "Violet"  "Yellow"

We have 10 teams (plus one “Unknown”) working in 8 repositories.

Exploratory Data Analysis

Introduced duplicates in files, per repo.

# Summary statistics used for the metrics overview table.
# The quantile helpers use type = 3 so that no interpolation between
# observations takes place.
quantile_stat <- function(prob) {
  force(prob)
  function(x) quantile(x, prob, type = 3)
}

low_stat <- quantile_stat(0.25)
median_stat <- quantile_stat(0.5)
high_stat <- quantile_stat(0.75)
# q95() is defined outside this section.
q95_stat <- function(x) {
  q95(x)
}
very_high_stat <- quantile_stat(0.99)
max_stat <- function(x) {
  max(x)
}

# Columns summarised in the overview table.
metric_columns <- c("ADD", "DEL", "COMPLEX", "DUP", "INTROD")

# Apply `stat` to every metric column and return one row per metric, with the
# resulting value stored in a column named `label`.
summarise_metrics <- function(stat, label) {
  data |>
    select(all_of(metric_columns)) |>
    summarise(across(everything(), stat)) |>
    pivot_longer(cols = everything(), values_to = label)
}

m1 <- summarise_metrics(min, "min")
m2 <- summarise_metrics(low_stat, "q25")
m3 <- summarise_metrics(median_stat, "median")
m4 <- summarise_metrics(high_stat, "q75")
m5 <- summarise_metrics(q95_stat, "q95")
m6 <- summarise_metrics(very_high_stat, "q99")
m7 <- summarise_metrics(max, "max")

# Join the per-statistic tables on the metric name, left to right.
metrics <- Reduce(
  function(lhs, rhs) merge(lhs, rhs, by = "name"),
  list(m1, m2, m3, m4, m5, m6, m7)
)
metrics
##      name min q25 median q75 q95 q99  max
## 1     ADD   0   1      6  28 143 370 3772
## 2 COMPLEX   0   3     16  52 282 633 1244
## 3     DEL   0   0      2  12  92 311 3413
## 4     DUP   0   0      0   2  36  99  664
## 5  INTROD   0   0      0   0   1   5  150
# Histogram of the number of duplicates introduced per file change, one panel
# per repository. Changes that introduced no duplicates are filtered out.
# INTROD is on the x axis and the bar height is the number of file changes,
# so each axis is labelled accordingly.
(p <- data |>
  filter(INTROD > 0) |>
  ggplot(aes(x = INTROD)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ repo) +
  xlab("Number of introduced duplicates in each file change") +
  ylab("Number of file changes")
)

Most of the introduced duplicates are small (single-digits), but some are large, ranging into the hundreds for the IntTest repo.

# Zoom the y axis instead of setting scale limits: scale limits remove every
# bar taller than 30 as out of range, whereas coord_cartesian() only restricts
# the visible region and keeps all bars in the plot.
p + coord_cartesian(ylim = c(0, 30))

Limiting the scale to 30 shows the less frequently occurring values more clearly.

Clone introduction ratio per team and repository

Proportion of file changes that introduce at least one clone, per team and repository. Size of the dot is proportional to the number of files changed by the team in the repository.

# Per repo and committer team: the number of file changes, how many of them
# introduced no duplicates, and the share that introduced at least one.
# Zero-duplicate changes are counted within the full grouping rather than after
# a filter(), so a repo/team pair in which every change introduced a duplicate
# is kept (zeros = 0, introdRatio = 1) instead of being dropped by the merge.
changesPerRepoAndTeam <- data |>
  group_by(repo, committerteam) |>
  summarise(fileschanged = n(), .groups = "drop_last")
zerosPerRepoAndTeam <- data |>
  group_by(repo, committerteam) |>
  summarise(zeros = sum(INTROD == 0, na.rm = TRUE), .groups = "drop_last")
# merge() joins on the shared columns (repo, committerteam).
zeros_ratio <- merge(changesPerRepoAndTeam, zerosPerRepoAndTeam) |>
  mutate(introdRatio = 1 - (zeros / fileschanged)) |>
  arrange(introdRatio)

# Share of file changes that introduce at least one duplicate, per team and
# repo. Dot size follows the number of file changes; "Unknown" is left out.
(p <- zeros_ratio |>
  filter(committerteam != "Unknown") |>
  ggplot(aes(
    x = committerteam,
    y = introdRatio,
    color = committerteam,
    size = fileschanged
  )) +
  geom_point() +
  facet_wrap(~ repo) +
  xlab("team") +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  scale_y_continuous(limits = c(0, 0.27)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
)

As we cannot ascertain which team the Unknown team members participate in, we exclude them from this metric.

Additions and deletions per team and repository

Viewing the (log-transformed) added and removed lines, we see some correlation, and also quite a few “pure additions” and “pure removals” (along the x and y axes, respectively).

# Added versus deleted lines (both log-transformed), one panel per repo,
# coloured by committer team.
data |>
  group_by(repo) |>
  ggplot(aes(x = logADD, y = logDEL, color = committerteam)) +
  geom_point() +
  facet_wrap(~ repo)

We see that, overall, there are more duplicates present in the IntTest and Jupiter repos (largest one). And there seems to be at least some indications of correlation with complexity (although there are also some duplicates in files where complexity is 0, i.e. the y axis).

# Existing duplicates versus complexity (both log-transformed), one panel per
# repo. No colour aesthetic is mapped here, so the manual colour scale had
# nothing to apply to and only produced a "No shared levels" warning; it is
# left out. The group_by() had no effect on the plot and is dropped as well.
data |>
  ggplot(aes(x = logCOMPLEX, y = logDUP)) +
  geom_point() +
  facet_wrap(~ repo) +
  theme_bw()

A more thorough pairs plot reveals that the parameters seem quite independent - at least no obvious correlations are present (though logCOMPLEX and logDUP are somewhat related, as seen above). The plot can be repeated per repo or per team, with the same conclusions.

ggpairs(data |> select(logADD, logDEL, logCOMPLEX, logDUP))

# Per repo and committer team: upper quantiles and maximum of introduced
# duplicates, number of file changes, and mean/median added and removed lines.
# Quantiles use type = 3 to avoid interpolating between observations.
repo_committerteam <- data |>
  group_by(repo, committerteam) |>
  summarize(
    q95 = quantile(INTROD, 0.95, type = 3),
    q99 = quantile(INTROD, 0.99, type = 3),
    max = max(INTROD),
    files = n(),
    mean_added = mean(ADD),
    median_added = median(ADD),
    mean_removed = mean(DEL),
    median_removed = median(DEL)
  ) |>
  mutate(team = committerteam)

# Heatmap of the number of file changes per team and repo.
repo_committerteam |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = files)) +
  geom_text(aes(label = round(files, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed number of changes to files by team and repo"
  )

Repositories are named and sized (though not to scale) after the seven neighboring planets, plus the INTTEST repository, which contains integration tests developed in Java.

We note that some teams have not changed files in some repositories, and that some teams have made very many, and some very few, changes to some repositories.

# Heatmap: mean number of added lines per team and repo.
repo_committerteam |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = mean_added)) +
  geom_text(aes(label = round(mean_added, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed mean number of added lines to files by team and repo"
  )

# Heatmap: median number of added lines per team and repo.
repo_committerteam |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = median_added)) +
  geom_text(aes(label = round(median_added, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed median number of added lines to files by team and repo"
  )

# Heatmap: mean number of removed lines per team and repo.
repo_committerteam |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = mean_removed)) +
  geom_text(aes(label = round(mean_removed, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed mean number of removed lines to files by team and repo"
  )

# Heatmap: median number of removed lines per team and repo.
repo_committerteam |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = median_removed)) +
  geom_text(aes(label = round(median_removed, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed median number of removed lines to files by team and repo"
  )

We note that adding lines is more common than removing them, and that the Architect team leads in removing lines in most repositories.

Introduced duplicates in Jupiter

The number of introduced duplicates follows a Poisson-like (or Negative Binomial) distribution once the zeros, which belong to the zero-inflation part, are excluded.

# Histogram (counts) of introduced duplicates per file change in one repo,
# with one panel per selected team.
#   aRepo          repo value to keep
#   team_selector  predicate applied to committerteam; TRUE keeps the row
# The y axis is limited to 0..65.
plot_introd_issues_in_repo <- function(aRepo, team_selector) {
  selected <- data |>
    filter(repo == aRepo, team_selector(committerteam)) |>
    mutate(team = committerteam) |>
    group_by(team)

  ggplot(selected, aes(x = INTROD, color = team, fill = team)) +
    geom_histogram(binwidth = 1) +
    facet_wrap(~ team) +
    scale_fill_manual(name = "team", values = teamcolors) +
    scale_colour_manual(name = "team", values = teamcolors) +
    ylab("number of changed files") +
    xlab("Number of duplicates introduced in the change") +
    ylim(0, 65)
}
(p <- plot_introd_issues_in_repo(
  JUPITER,
  function(x) x %in% c(RED, BLUE, GREEN, ARCH)
) +
  theme_bw() +
  theme(legend.position = "bottom"))
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Histogram (density scale) of introduced duplicates per file change in one
# repo, with one panel per selected team.
#   aRepo          repo value to keep; also used in the plot title
#   team_selector  predicate applied to committerteam; TRUE keeps the row
# Uses after_stat(density) in place of the `..density..` notation, which was
# deprecated in ggplot2 3.4.0.
plot_introd_per_team <- function(aRepo, team_selector) {
  data |>
    filter(repo == aRepo, team_selector(committerteam)) |>
    mutate(team = committerteam) |>
    group_by(team) |>
    ggplot(aes(x = INTROD, color = team, fill = team)) +
    geom_histogram(aes(y = after_stat(density)), binwidth = 1) +
    facet_wrap(~ team) +
    ggtitle(paste("Introduced duplicates in repo", aRepo)) +
    scale_fill_manual(name = "team", values = teamcolors) +
    scale_colour_manual(name = "team", values = teamcolors) +
    ylab("proportion of filechanges") +
    xlab("Number of duplicates introduced in the change")
}
(p <- plot_introd_per_team(JUPITER, function(x) x %in% c(ARCH, RED, BLUE, GREEN)) + theme_bw() + scale_y_continuous(limits=c(0,0.04)) )
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Density histograms for the same four teams in three further repos.
(p <- plot_introd_per_team(INTTEST, \(team) team %in% c(ARCH, RED, BLUE, GREEN)) +
  theme_bw() +
  scale_y_continuous(limits = c(0, 0.04))
)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

(p <- plot_introd_per_team(URANUS, \(team) team %in% c(ARCH, RED, BLUE, GREEN)) +
  theme_bw() +
  scale_y_continuous(limits = c(0, 0.06))
)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

(p <- plot_introd_per_team(VENUS, \(team) team %in% c(ARCH, RED, BLUE, GREEN)) +
  theme_bw() +
  scale_y_continuous(limits = c(0, 0.04))
)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# How many file changes introduced each number of duplicates, for selected
# repo/team combinations.
data |>
  filter(repo == INTTEST, committerteam == ARCH) |>
  group_by(INTROD) |>
  tally()
## # A tibble: 4 × 2
##   INTROD     n
##    <int> <int>
## 1      0   238
## 2      1     1
## 3      2     1
## 4      4     1
data |>
  filter(repo == VENUS, committerteam == ARCH) |>
  group_by(INTROD) |>
  tally()
## # A tibble: 1 × 2
##   INTROD     n
##    <int> <int>
## 1      0    86
data |>
  filter(repo == VENUS, committerteam == GREEN) |>
  group_by(INTROD) |>
  tally()
## # A tibble: 3 × 2
##   INTROD     n
##    <int> <int>
## 1      0   347
## 2      1     2
## 3      2     2
# Median complexity of the files changed by Blue and Green in Uranus.
data |>
  filter(committerteam %in% c(BLUE, GREEN), repo == URANUS) |>
  group_by(committerteam) |>
  summarize(median(COMPLEX))
## # A tibble: 2 × 2
##   committerteam `median(COMPLEX)`
##   <fct>                     <dbl>
## 1 Blue                         14
## 2 Green                        16
# Blue and Green in Mars: count, median complexity and within-team frequency
# for each number of introduced duplicates.
data |>
  filter(committerteam %in% c(BLUE, GREEN), repo == MARS) |>
  group_by(committerteam, INTROD) |>
  summarize(n = n(), median(COMPLEX)) |>
  mutate(freq = n / sum(n))
## # A tibble: 13 × 5
## # Groups:   committerteam [2]
##    committerteam INTROD     n `median(COMPLEX)`    freq
##    <fct>          <int> <int>             <dbl>   <dbl>
##  1 Blue               0   730               6.5 0.952  
##  2 Blue               1    18              26.5 0.0235 
##  3 Blue               2    16              29   0.0209 
##  4 Blue               4     1              16   0.00130
##  5 Blue               6     1              48   0.00130
##  6 Blue              14     1             106   0.00130
##  7 Green              0   561               6   0.862  
##  8 Green              1    65               8   0.0998 
##  9 Green              2    10              15   0.0154 
## 10 Green              3    10              22   0.0154 
## 11 Green              4     3               2   0.00461
## 12 Green              5     1              18   0.00154
## 13 Green             10     1               2   0.00154
# Four teams in Jupiter: count and within-team frequency for each number of
# introduced duplicates.
data |>
  filter(committerteam %in% c(ARCH, BLUE, RED, GREEN), repo == JUPITER) |>
  group_by(committerteam, INTROD) |>
  summarize(n = n()) |>
  mutate(freq = n / sum(n))
## # A tibble: 40 × 4
## # Groups:   committerteam [4]
##    committerteam INTROD     n    freq
##    <fct>          <int> <int>   <dbl>
##  1 Arch               0   594 0.953  
##  2 Arch               1    23 0.0369 
##  3 Arch               2     3 0.00482
##  4 Arch               4     2 0.00321
##  5 Arch               6     1 0.00161
##  6 Blue               0  2006 0.945  
##  7 Blue               1    61 0.0287 
##  8 Blue               2    23 0.0108 
##  9 Blue               3    17 0.00801
## 10 Blue               4     8 0.00377
## # ℹ 30 more rows

There are also differences between teams — we see that the ARCH team in this repo introduced very few duplicates, whereas the Blue and Green teams were more comparable.

The pattern is even more pronounced in the IntTest repo

# Four teams in one repo: count and within-team frequency for each number of
# introduced duplicates.
# NOTE(review): the sentence introducing this table refers to the IntTest repo,
# but the filter below selects URANUS - confirm which repo is intended.
data |> filter(committerteam %in% c(ARCH, BLUE, RED, GREEN), repo == URANUS) |> group_by(committerteam, INTROD) |> summarize(n=n()) |> mutate(freq=n/sum(n))
## # A tibble: 19 × 4
## # Groups:   committerteam [4]
##    committerteam INTROD     n     freq
##    <fct>          <int> <int>    <dbl>
##  1 Arch               0   116 0.967   
##  2 Arch               1     4 0.0333  
##  3 Blue               0   968 0.925   
##  4 Blue               1    44 0.0420  
##  5 Blue               2    23 0.0220  
##  6 Blue               3     5 0.00478 
##  7 Blue               4     4 0.00382 
##  8 Blue               5     2 0.00191 
##  9 Blue              17     1 0.000955
## 10 Green              0   538 0.921   
## 11 Green              1    33 0.0565  
## 12 Green              2    10 0.0171  
## 13 Green              3     2 0.00342 
## 14 Green              5     1 0.00171 
## 15 Red                0   348 0.902   
## 16 Red                1    19 0.0492  
## 17 Red                2    11 0.0285  
## 18 Red                3     7 0.0181  
## 19 Red                4     1 0.00259

Observed distribution of duplicates

# Empirical CDF of introduced duplicates for four teams in three repos.
# The y axis is limited to 0.85..1.0.
(p <- data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, BROWN),
    repo %in% c(INTTEST, JUPITER, URANUS)
  ) |>
  group_by(repo, committerteam, INTROD) |>
  ggplot(aes(x = INTROD, color = committerteam)) +
  stat_ecdf() +
  facet_wrap(~ repo) +
  labs(
    x = "Maximum number of introduced duplicates",
    title = "Observed cumulative distribution of introduced duplicates"
  ) +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  scale_y_continuous(limits = c(0.85, 1.0))
)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).

# Empirical CDF of the complexity of changed files (log1p x axis) for four
# teams in three repos.
(p <- data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, GREEN),
    repo %in% c(INTTEST, JUPITER, URANUS)
  ) |>
  group_by(repo, committerteam) |>
  ggplot(aes(x = COMPLEX, color = committerteam)) +
  stat_ecdf() +
  facet_wrap(~ repo) +
  scale_x_continuous(trans = "log1p", breaks = c(0, 1, 3, 10, 50, 200, 1000)) +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  labs(
    x = "Existing complexity in changed file (log scale)",
    title = "Observed cumulative frequency of existing complexity"
  )
)

In the Jupiter repo, Team Blue are more likely to make changes in less complex files, relative to the Red team. Same for Arch team. In the Neptune repo, the Arch team likewise are less likely to change complex files, but the Blue and Red teams are more similar.

# Overall maxima of the predictors.
data |>
  summarize(max(ADD), max(DEL), max(COMPLEX), max(DUP))
##   max(ADD) max(DEL) max(COMPLEX) max(DUP)
## 1     3772     3413         1244      664
# Per-team maxima in Jupiter.
data |>
  filter(committerteam %in% c(ARCH, BLUE, RED, GREEN), repo %in% c(JUPITER)) |>
  group_by(committerteam) |>
  summarize(n(), max(ADD), max(DEL), max(COMPLEX), max(DUP))
## # A tibble: 4 × 6
##   committerteam `n()` `max(ADD)` `max(DEL)` `max(COMPLEX)` `max(DUP)`
##   <fct>         <int>      <int>      <int>          <int>      <int>
## 1 Arch            623        489        741           1244         44
## 2 Blue           2123        734        730           1096         53
## 3 Green          1172       1374       1575           1122         38
## 4 Red            2166       2439        683           1244         53
# Per-team q95() values in Jupiter.
data |>
  filter(committerteam %in% c(ARCH, BLUE, RED, GREEN), repo %in% c(JUPITER)) |>
  group_by(committerteam) |>
  summarize(n(), q95(ADD), q95(DEL), q95(COMPLEX), q95(DUP))
## # A tibble: 4 × 6
##   committerteam `n()` `q95(ADD)` `q95(DEL)` `q95(COMPLEX)` `q95(DUP)`
##   <fct>         <int>      <dbl>      <dbl>          <dbl>      <dbl>
## 1 Arch            623        66       181.             103         20
## 2 Blue           2123        77        38.9            101         16
## 3 Green          1172       135.       64              313         23
## 4 Red            2166       133        59              304         21
# Empirical CDFs (log1p x axis) of existing duplicates, added lines and deleted
# lines, for four teams in three repos.
(p <- data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, GREEN),
    repo %in% c(INTTEST, JUPITER, URANUS)
  ) |>
  group_by(repo, committerteam) |>
  ggplot(aes(x = DUP, color = committerteam)) +
  stat_ecdf() +
  facet_wrap(~ repo) +
  scale_x_continuous(trans = "log1p", breaks = c(0, 1, 3, 10, 50, 200, 1000)) +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  labs(
    x = "Duplicates in changed file (log scale)",
    title = "Observed cumulative frequency of existing duplicates"
  )
)

(p <- data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, GREEN),
    repo %in% c(INTTEST, JUPITER, URANUS)
  ) |>
  group_by(repo, committerteam) |>
  ggplot(aes(x = ADD, color = committerteam)) +
  stat_ecdf() +
  facet_wrap(~ repo) +
  # optional, currently disabled: guide = guide_axis(angle = 90)
  scale_x_continuous(trans = "log1p", breaks = c(0, 1, 3, 10, 50, 200, 1000)) +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  labs(
    x = "Number of added lines (log scale)",
    title = "Observed cumulative frequency of added lines"
  )
)

(p <- data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, GREEN),
    repo %in% c(INTTEST, JUPITER, URANUS)
  ) |>
  group_by(repo, committerteam) |>
  ggplot(aes(x = DEL, color = committerteam)) +
  stat_ecdf() +
  facet_wrap(~ repo) +
  scale_x_continuous(
    trans = "log1p",
    breaks = c(0, 1, 3, 10, 50, 200, 1000, 4000)
  ) +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  # optional, currently disabled: scale_y_continuous(limits = c(0.9, 1))
  labs(
    x = "Number of deleted lines (log scale)",
    title = "Observed cumulative frequency of deleted lines"
  )
)

# Median and 75% quantile (type = 3) of ADD, DEL, COMPLEX and DUP, plus the row
# count, for every repo/team combination in `df`. `team` is a copy of
# `committerteam`.
quantile_table <- function(df) {
  df |>
    mutate(team = committerteam) |>
    group_by(repo, team) |>
    summarize(
      n = n(),
      q50ADD = median(ADD),
      q50DEL = median(DEL),
      q50COMP = median(COMPLEX),
      q50DUP = median(DUP),
      q75ADD = quantile(ADD, 0.75, type = 3),
      q75DEL = quantile(DEL, 0.75, type = 3),
      q75COMP = quantile(COMPLEX, 0.75, type = 3),
      q75DUP = quantile(DUP, 0.75, type = 3)
    )
}
quantile_table(
  data |>
    filter(
      committerteam %in% c(ARCH, BLUE, RED, GREEN),
      repo %in% c(INTTEST, JUPITER, URANUS)
    )
)
## # A tibble: 12 × 11
## # Groups:   repo [3]
##    repo    team      n q50ADD q50DEL q50COMP q50DUP q75ADD q75DEL q75COMP q75DUP
##    <fct>   <fct> <int>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl>
##  1 IntTest Arch    241      2      3      36      7     16     29     163     55
##  2 IntTest Blue   1030      8      3      58      8     41     14     167     54
##  3 IntTest Green   727      3      2      65     22     13      7     183     65
##  4 IntTest Red     617     10      4      48     13     47     25     211     66
##  5 Jupiter Arch    623      3     12      16      0     12     34      33      4
##  6 Jupiter Blue   2123     11      2      10      0     28      8      32      1
##  7 Jupiter Green  1172      7      3      21      0     27     11      58      3
##  8 Jupiter Red    2166      5      5      18      0     18     15      49      2
##  9 Uranus  Arch    120      2      8      22      0      8     45      33      1
## 10 Uranus  Blue   1047     12      2      14      0     37      9      36      1
## 11 Uranus  Green   584      6      2      16      0     26     12      35      1
## 12 Uranus  Red     386     11      2      10      0     54     14      25      0
# Number of Jupiter file changes with at most three deleted lines, per team.
data |>
  filter(committerteam %in% c(ARCH, BLUE, RED), repo == JUPITER, DEL <= 3) |>
  group_by(committerteam) |>
  tally()
## # A tibble: 3 × 2
##   committerteam     n
##   <fct>         <int>
## 1 Arch            168
## 2 Blue           1289
## 3 Red             907
# Empirical CDF of changed-file complexity (log1p x axis), four teams, three
# repos.
data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, GREEN),
    repo %in% c(INTTEST, JUPITER, URANUS)
  ) |>
  group_by(repo, committerteam, COMPLEX, logCOMPLEX) |>
  ggplot(aes(x = COMPLEX, color = committerteam)) +
  stat_ecdf() +
  facet_wrap(~ repo) +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  scale_x_continuous(trans = "log1p", breaks = c(0, 3, 10, 50, 200, 1000))

# Team predicate for the plotting helpers. Despite its name it selects four
# teams: Arch, Red, Blue and Green.
three_teams <- function(x) {
  x %in% c(ARCH, RED, BLUE, GREEN)
}

(p <- plot_introd_issues_in_repo(INTTEST, three_teams))
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

There seems to be some team-level variation, as well as repo-level variation, in the number of introduced duplicates.

Comparing the largest repo, and the three largest contributors to that repo reveals that the Blue team is more likely to introduce a small amount of duplicates (it has almost double the amount of single-added duplicates as the Red team, which has a similar amount of contributions). But the red team has some more occurrences of 4-8 duplicates added to a single file.

# Count histogram for the Red, Green and Blue teams in Jupiter.
(p <- plot_introd_issues_in_repo(
  JUPITER,
  \(team) team %in% c(RED, GREEN, BLUE)
))
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Quantile plots

# Heatmap of the 95% quantile of introduced duplicates per team and repo.
repo_committerteam |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = q95)) +
  geom_text(aes(label = round(q95, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed 95% introductions by team and repo"
  )

The 95% quantile plots show that for many repos, and many teams in those repos, in 19 out of 20 file changes, no duplicates were added. The outlier is INTTEST, where many teams can be expected to introduce single-digit duplicates. In the NEPTUNE repository, we also can expect some more duplicate introduction than in the others.

# Heatmap of the 99% quantile of introduced duplicates per team and repo.
repo_committerteam |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = q99)) +
  geom_text(aes(label = round(q99, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed 99% introductions by team and repo"
  )

The pattern repeats for the 99% quantile. Integration test can be expected to have more duplicates introduced, and in particular the Brown team stands out. In the Neptune repo, however, it is the Blue and Green teams that are a bit more likely to introduce duplicates.

We also note that the Architect team are unlikely to introduce duplicates.

# Heatmap of the maximum number of introduced duplicates per team and repo.
repo_committerteam |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = max)) +
  geom_text(aes(label = round(max, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed max number of introduced duplicates by team and repo"
  )

# Largest and smallest number of file changes within each repo, used below to
# rescale the fill colour per row.
repo_committerteam_summary <- repo_committerteam |>
  mutate(files = replace_na(files, 0)) |>
  group_by(repo) |>
  summarize(mx = max(files), mn = min(files))
# Same counts as the files heatmap, coloured by position between the repo's
# minimum and maximum.
(p <- repo_committerteam |>
  inner_join(repo_committerteam_summary) |>
  mutate(p = (files - mn) / (mx - mn)) |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = p)) +
  geom_text(aes(label = round(files, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed number of changes to files by team and repo",
    subtitle = "Heatmap colored by row"
  )
)
## Joining with `by = join_by(repo)`

The Blue team are the most frequent in changing files, with Red and Green following. The Pink, UI and Unknown teams are much more distant.

# Largest and smallest per-team maximum within each repo, used to rescale the
# fill colour per row.
repo_committerteam_summary <- repo_committerteam |>
  mutate(max = replace_na(max, 0)) |>
  group_by(repo) |>
  summarize(mx = max(max), mn = min(max))
(p <- repo_committerteam |>
  inner_join(repo_committerteam_summary) |>
  mutate(p = (max - mn) / (mx - mn)) |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = p)) +
  geom_text(aes(label = round(max, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed max number of introduced duplicates by team and repo",
    subtitle = "Heatmap colored by row"
  )
)
## Joining with `by = join_by(repo)`

The maximum number of introduced duplicates are more spread out between the teams - Blue, Red and Green all have the lead in one or two repositories. But Brown leads in two (including Integration tests, with the overall maximum of 150 introduced clones), and Yellow in Mercury, the smallest repository in terms of LOC.

# Largest and smallest per-team 99% quantile within each repo, used to rescale
# the fill colour per row. The NA replacement now targets q99 itself; it
# previously wrote the q99 values into the `max` column (copy-paste slip),
# which left q99 untouched and clobbered `max` in the intermediate table.
repo_committerteam_summary <- repo_committerteam |>
  mutate(q99 = replace_na(q99, 0)) |>
  group_by(repo) |>
  summarize(mx = max(q99), mn = min(q99))
# The join uses the shared `repo` column.
(p <- repo_committerteam |>
  inner_join(repo_committerteam_summary) |>
  mutate(p = (q99 - mn) / (mx - mn)) |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = p)) +
  geom_text(aes(label = round(q99, 0)), color = "white") +
  xlab("team") + ylab("repo") +
  ggtitle("Observed Q99 number of introduced duplicates by team and repo", "Heatmap colored by row")
)
## Joining with `by = join_by(repo)`

The 99th percentile is more spread out - but Brown still have a commanding lead in the Integration tests, where they introduced 103 duplicates in a single change.

# Number of distinct committers per team and repo.
data |>
  group_by(repo, committerteam) |>
  select(committer) |>
  distinct() |>
  tally() |>
  ggplot(aes(x = committerteam, y = repo)) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed number of committers by team and repo"
  )
## Adding missing grouping variables: `repo`, `committerteam`

# Number of distinct authors per team and repo.
data |>
  group_by(repo, authorteam) |>
  select(author) |>
  distinct() |>
  tally() |>
  ggplot(aes(x = authorteam, y = repo)) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed number of authors by team and repo"
  )
## Adding missing grouping variables: `repo`, `authorteam`

The maximum number of introduced duplicates reveals that the Brown team, which also had high 95% value in the IntTest repo, also introduced 150 duplicates, the overall max value, to a single file in the integration test repository. Overall, we see the pattern repeat:

  • Integration tests are highly likely to involve additions of duplicates
  • The architect team overall is highly unlikely to introduce any duplicates
  • There seem to be differences between teams in the number of introduced duplicates, relative to their contributions: the Red and Blue teams differ, and the Brown team stands out for introducing many duplicates in at least the integration test repository.
# Introduced duplicates (non-zero only) against D, coloured by C ...
data |>
  filter(INTROD > 0) |>
  ggplot(aes(x = D, y = INTROD, color = C)) +
  geom_point()

# ... and against C, coloured by D.
data |>
  filter(INTROD > 0) |>
  ggplot(aes(x = C, y = INTROD, color = D)) +
  geom_point()

Those plots show that complexity, rather than the number of existing duplicates, is more linearly related to the number of introduced duplicates. At the same time, complexity and existing duplicates are also related - it is just that even if DUP is zero, there is no strong evidence that INTROD is low or zero.

# C against D for changes that introduced duplicates, coloured by INTROD.
data |>
  filter(INTROD > 0) |>
  ggplot(aes(x = C, y = D, color = INTROD)) +
  geom_point()

Distribution of parameters

Added lines

# Distribution of added lines per repo; 1 is added before the log transform.
data |>
  ggplot(aes(x = repo, y = (1 + ADD))) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  scale_y_continuous(trans = "log", breaks = c(5, 20, 100, 500, 1000, 3000)) +
  ylab("Added lines")

Overall, at the median, every repository has fewer than 10 added lines. Some outliers occur.

# Distribution of added lines per team; 1 is added before the log transform.
data |>
  ggplot(aes(x = committerteam, y = (1 + ADD))) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  scale_y_continuous(trans = "log", breaks = c(5, 20, 100, 500, 1000, 3000)) +
  ylab("Added lines")

Between the teams (regardless of repository), there is more variation.

Deleted lines

# Distribution of deleted lines per team; 1 is added before the log transform.
data |>
  ggplot(aes(x = committerteam, y = (1 + DEL))) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  scale_y_continuous(trans = "log", breaks = c(5, 20, 100, 500, 1000, 3000)) +
  ylab("Deleted lines")

Existing clones

# Distribution of existing duplicates per team; 1 is added before the log
# transform.
data |>
  ggplot(aes(x = committerteam, y = (1 + DUP))) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  scale_y_continuous(trans = "log", breaks = c(5, 20, 100, 500)) +
  ylab("Existing duplicates")

Changed file complexity

# Distribution of changed-file complexity per team; 1 is added before the log
# transform.
data |>
  ggplot(aes(x = committerteam, y = (1 + COMPLEX))) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  scale_y_continuous(trans = "log", breaks = c(5, 20, 100, 500, 1000)) +
  ylab("McCabe complexity")

OCAM metrics

Data definition:

  • churn: is the per-file-change max value of added and removed lines. So a file change with 2 added and 20 removed lines will have a churn value of 20.
  • addcomplex: is the per-file-added McCabe complexity, or 0 in case the complexity of the file decreased due to the change.
  • delcomplex: is the per-file-deleted McCabe complexity, or 0 in case the complexity of the file increased due to the change.
  • commits: the number of unique commits made by the team

The above definition means that, at minimum, one of either addcomplex or delcomplex is zero, for any given file change. The only way for the churn value to be zero is if an empty file (containing no lines) was removed. This happens four times in the data set.

# OCAM rank plots, one per repository. ocam_metrics() and ocam_rank_repo() are
# defined outside this section.
ocam_data <- data

(p <- ocam_rank_repo(
  ocam_metrics(ocam_data) |> dplyr::filter(repo == INTTEST)
) +
  theme(legend.position = "none"))

ocam_data <- data
(p <- ocam_rank_repo(
  ocam_metrics(ocam_data) |> filter(repo == INTTEST)
) +
  theme_bw())

(p <- ocam_rank_repo(
  ocam_metrics(ocam_data) |> filter(repo == JUPITER)
) +
  theme_bw())

(p <- ocam_rank_repo(
  ocam_metrics(ocam_data) |> filter(repo == SATURN)
) +
  theme_bw())

(p <- ocam_rank_repo(
  ocam_metrics(ocam_data) |> filter(repo == URANUS)
) +
  theme_bw())

ocam_rank_repo(
  ocam_metrics(ocam_data) |> filter(repo == NEPTUNE)
) +
  theme_bw()

ocam_rank_repo(
  ocam_metrics(ocam_data) |> filter(repo == VENUS)
) +
  theme_bw()

(p <- ocam_rank_repo(
  ocam_metrics(ocam_data) |> filter(repo == MARS)
) +
  theme_bw())

ocam_rank_repo(
  ocam_metrics(ocam_data) |> filter(repo == MERCURY)
) +
  theme_bw()

# Two repos in one plot, without legend and y-axis title.
ocam_rank_repo(
  ocam_metrics(ocam_data) |> filter(repo %in% c(JUPITER, URANUS))
) +
  theme_bw() +
  theme(legend.position = "none") +
  ylab(NULL)

Combining repos in the same plot allows for contrasting pictures, like this for Jupiter and Uranus.

Conditional independencies

We can test whether our data supports the conditional independencies implied by our DAG.

REM _||_ COMP | TEAM

and

REM _||_ DUP | TEAM

These would seem to imply that knowing TEAM makes REM and COMPLEX independent (and, similarly, REM and DUP). Do our data support that claim?

Is REMOVED independent of COMPLEXITY, once TEAM is considered?

# Model data: the outcome is renamed to y and the committer team to team; the
# remaining columns keep their names.
d <- data |>
  select(
    y = INTROD,
    A,
    C,
    D,
    R,
    team = committerteam,
    repo
  )
# Regress C on R and team.
formula <- bf(C ~ R + team)
# List the priors that can be set for this model.
get_prior(formula = formula, data = d, family = gaussian)
##                   prior     class        coef group resp dpar nlpar lb ub
##                  (flat)         b                                        
##                  (flat)         b           R                            
##                  (flat)         b    teamBlue                            
##                  (flat)         b   teamBrown                            
##                  (flat)         b   teamGreen                            
##                  (flat)         b  teamOrange                            
##                  (flat)         b    teamPink                            
##                  (flat)         b     teamRed                            
##                  (flat)         b      teamUI                            
##                  (flat)         b teamUnknown                            
##                  (flat)         b  teamViolet                            
##                  (flat)         b  teamYellow                            
##  student_t(3, 0.1, 2.5) Intercept                                        
##    student_t(3, 0, 2.5)     sigma                                    0   
##        source
##       default
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##       default
##       default
# normal(0, 0.5) on the intercept and on every slope; exponential(1) on sigma.
priors <- c(
  prior(normal(0, 0.5), class = Intercept),
  prior(normal(0, 0.5), class = b),
  prior(exponential(1), class = sigma)
)

# Validate the priors against the model and print the resulting prior table.
(v <- validate_prior(
  prior = priors,
  formula = formula,
  data = d,
  family = gaussian
))
##           prior     class        coef group resp dpar nlpar lb ub       source
##  normal(0, 0.5)         b                                                 user
##  normal(0, 0.5)         b           R                             (vectorized)
##  normal(0, 0.5)         b    teamBlue                             (vectorized)
##  normal(0, 0.5)         b   teamBrown                             (vectorized)
##  normal(0, 0.5)         b   teamGreen                             (vectorized)
##  normal(0, 0.5)         b  teamOrange                             (vectorized)
##  normal(0, 0.5)         b    teamPink                             (vectorized)
##  normal(0, 0.5)         b     teamRed                             (vectorized)
##  normal(0, 0.5)         b      teamUI                             (vectorized)
##  normal(0, 0.5)         b teamUnknown                             (vectorized)
##  normal(0, 0.5)         b  teamViolet                             (vectorized)
##  normal(0, 0.5)         b  teamYellow                             (vectorized)
##  normal(0, 0.5) Intercept                                                 user
##  exponential(1)     sigma                                    0            user
# Fit C ~ R + team. The fit is cached on disk and refitted only on change.
M_cond_ind <- brm(
  formula = formula,
  data = d,
  family = gaussian,
  prior = priors,
  file = cachefile("eda-M_cond_ind_R_C"),
  file_refit = "on_change",
  backend = "cmdstanr",
  warmup = 1000,
  iter = ITERATIONS,
  chains = CHAINS,
  cores = CORES,
  threads = threading(THREADS),
  save_pars = SAVE_PARS,
  adapt_delta = ADAPT_DELTA
)
summary(M_cond_ind)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: C ~ R + team 
##    Data: d (Number of observations: 31007) 
##   Draws: 4 chains, each with iter = 4000; warmup = 1000; thin = 1;
##          total post-warmup draws = 12000
## 
## Population-Level Effects: 
##             Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept      -0.13      0.02    -0.18    -0.08 1.00     2875     4078
## R              -0.01      0.01    -0.02     0.00 1.00    13552     9336
## teamBlue        0.04      0.03    -0.01     0.09 1.00     3134     5495
## teamBrown      -0.18      0.03    -0.24    -0.12 1.00     3739     6261
## teamGreen       0.19      0.03     0.13     0.24 1.00     3341     5645
## teamOrange      0.23      0.03     0.17     0.29 1.00     3803     5988
## teamPink        0.27      0.05     0.17     0.36 1.00     5714     6958
## teamRed         0.31      0.03     0.25     0.36 1.00     3418     5696
## teamUI          0.64      0.12     0.40     0.88 1.00     9529     7916
## teamUnknown    -0.19      0.05    -0.28    -0.10 1.00     5787     7629
## teamViolet      0.51      0.04     0.44     0.58 1.00     4337     7033
## teamYellow      0.08      0.03     0.02     0.14 1.00     3928     6672
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.99      0.00     0.98     0.99 1.00    16772     8598
## 
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

We are only really interested in the population-level effect of R on C here (ignoring the team coefficients). The 95% interval for the R coefficient (-0.02 to 0.00) includes 0, so the data are consistent with REM and COMPLEX being independent once TEAM is considered.

Is REMOVED independent of DUPLICATES, once TEAM is considered?

# D regressed on R, adjusting for team.
formula <- bf(D ~ R + team)

# List the parameters that can be given priors, with their current defaults.
get_prior(formula = formula, data = d, family = gaussian)
##                    prior     class        coef group resp dpar nlpar lb ub
##                   (flat)         b                                        
##                   (flat)         b           R                            
##                   (flat)         b    teamBlue                            
##                   (flat)         b   teamBrown                            
##                   (flat)         b   teamGreen                            
##                   (flat)         b  teamOrange                            
##                   (flat)         b    teamPink                            
##                   (flat)         b     teamRed                            
##                   (flat)         b      teamUI                            
##                   (flat)         b teamUnknown                            
##                   (flat)         b  teamViolet                            
##                   (flat)         b  teamYellow                            
##  student_t(3, -0.6, 2.5) Intercept                                        
##     student_t(3, 0, 2.5)     sigma                                    0   
##        source
##       default
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##       default
##       default
# normal(0, 0.5) on the intercept and on every slope; exponential(1) on sigma.
priors <- c(
  prior(normal(0, 0.5), class = Intercept),
  prior(normal(0, 0.5), class = b),
  prior(exponential(1), class = sigma)
)

# Validate the priors against the model and print the resulting prior table.
(v <- validate_prior(
  prior = priors,
  formula = formula,
  data = d,
  family = gaussian
))
##           prior     class        coef group resp dpar nlpar lb ub       source
##  normal(0, 0.5)         b                                                 user
##  normal(0, 0.5)         b           R                             (vectorized)
##  normal(0, 0.5)         b    teamBlue                             (vectorized)
##  normal(0, 0.5)         b   teamBrown                             (vectorized)
##  normal(0, 0.5)         b   teamGreen                             (vectorized)
##  normal(0, 0.5)         b  teamOrange                             (vectorized)
##  normal(0, 0.5)         b    teamPink                             (vectorized)
##  normal(0, 0.5)         b     teamRed                             (vectorized)
##  normal(0, 0.5)         b      teamUI                             (vectorized)
##  normal(0, 0.5)         b teamUnknown                             (vectorized)
##  normal(0, 0.5)         b  teamViolet                             (vectorized)
##  normal(0, 0.5)         b  teamYellow                             (vectorized)
##  normal(0, 0.5) Intercept                                                 user
##  exponential(1)     sigma                                    0            user
# Fit D ~ R + team. The fit is cached on disk and refitted only on change.
M_cond_ind <- brm(
  formula = formula,
  data = d,
  family = gaussian,
  prior = priors,
  file = cachefile("eda-M_cond_ind_R_D"),
  file_refit = "on_change",
  backend = "cmdstanr",
  warmup = 1000,
  iter = ITERATIONS,
  chains = CHAINS,
  cores = CORES,
  threads = threading(THREADS),
  save_pars = SAVE_PARS,
  adapt_delta = ADAPT_DELTA
)
summary(M_cond_ind)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: D ~ R + team 
##    Data: d (Number of observations: 31007) 
##   Draws: 4 chains, each with iter = 4000; warmup = 1000; thin = 1;
##          total post-warmup draws = 12000
## 
## Population-Level Effects: 
##             Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept       0.01      0.02    -0.03     0.06 1.00     2702     3996
## R               0.12      0.01     0.11     0.13 1.00    12194     8426
## teamBlue       -0.09      0.03    -0.14    -0.04 1.00     2915     4761
## teamBrown      -0.19      0.03    -0.25    -0.13 1.00     3585     5710
## teamGreen       0.10      0.03     0.05     0.16 1.00     3201     5638
## teamOrange     -0.01      0.03    -0.07     0.05 1.00     3537     5977
## teamPink       -0.10      0.05    -0.20    -0.00 1.00     5926     7085
## teamRed         0.05      0.03    -0.00     0.11 1.00     3197     5328
## teamUI          0.49      0.12     0.25     0.73 1.00    10398     8228
## teamUnknown    -0.25      0.05    -0.34    -0.16 1.00     5866     6725
## teamViolet      0.21      0.04     0.15     0.28 1.00     4091     6485
## teamYellow      0.01      0.03    -0.05     0.07 1.00     3705     6102
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.99      0.00     0.98     0.99 1.00    16685     8748
## 
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

There appears to be a weak positive association between DUP (D) and REM (R): the estimated coefficient is 0.12, with a 95% interval from 0.11 to 0.13, which excludes 0.

Test versus non-test code behaviour

# Histogram of INTROD for one team's non-test rows (ISTEST is FALSE).
#
# Bar heights are count / sum(count), labelled as percentages; bars are
# filled by repo and are one INTROD unit wide.
#
# df:   data frame with committerteam, ISTEST, INTROD and repo columns.
# team: value compared against committerteam; also appended to the title.
# Returns the ggplot object.
plot_nontest_INTROD_for_team <- function(df, team) {
  # FALSE is spelled out: the shorthand F is an ordinary, reassignable variable.
  df |>
    filter(committerteam == team, ISTEST == FALSE) |>
    ggplot(aes(x = INTROD, fill = repo)) +
    geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = 1) +
    scale_y_continuous(labels = scales::percent) +
    ggtitle(paste0("Introduced duplicates in non-test code by team ", team))
}
# Histogram of INTROD for one team's test rows (ISTEST is TRUE).
#
# Bar heights are count / sum(count), labelled as percentages; bars are
# filled by repo and are one INTROD unit wide.
#
# df:   data frame with committerteam, ISTEST, INTROD and repo columns.
# team: value compared against committerteam; also appended to the title.
# Returns the ggplot object.
plot_test_INTROD_for_team <- function(df, team) {
  # TRUE is spelled out: the shorthand T is an ordinary, reassignable variable.
  df |>
    filter(committerteam == team, ISTEST == TRUE) |>
    ggplot(aes(x = INTROD, fill = repo)) +
    geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = 1) +
    scale_y_continuous(labels = scales::percent) +
    ggtitle(paste0("Introduced duplicates in test code by team ", team))
}
# Histogram of INTROD for all rows of one team.
#
# Bar heights are count / sum(count), labelled as percentages; bars are
# filled by repo and are one INTROD unit wide.
#
# df:   data frame with committerteam, INTROD and repo columns.
# team: value compared against committerteam; also appended to the title.
# Returns the ggplot object.
plot_INTROD_for_team <- function(df, team) {
  df |>
    filter(committerteam == team) |>
    ggplot(aes(x = INTROD, fill = repo)) +
    geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = 1) +
    scale_y_continuous(labels = scales::percent) +
    ggtitle(paste0("Introduced duplicates by team ", team))
}

Team Blue - test vs. non-test code

# Limits are set on a percent-labelled scale: ylim() would swap in a plain
# y scale and drop the percent labels set by the plotting function.
(p <- plot_nontest_INTROD_for_team(data, BLUE) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.03)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Limits are set on a percent-labelled scale: ylim() would swap in a plain
# y scale and drop the percent labels set by the plotting function.
(p <- plot_test_INTROD_for_team(data, BLUE) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.03)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Limits are set on a percent-labelled scale: ylim() would swap in a plain
# y scale and drop the percent labels set by the plotting function.
(p <- plot_INTROD_for_team(data, BLUE) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.03)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Some repositories (such as IntTest) have lots of test code. Adding the ISTEST predictor does not seem to add more information than the usual repository information (shapes of the histograms are similar).

Team Red - test vs. non-test code

# Limits are set on a percent-labelled scale: ylim() would swap in a plain
# y scale and drop the percent labels set by the plotting function.
(p <- plot_nontest_INTROD_for_team(data, RED) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.025)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Limits are set on a percent-labelled scale: ylim() would swap in a plain
# y scale and drop the percent labels set by the plotting function.
(p <- plot_test_INTROD_for_team(data, RED) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.025)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Limits are set on a percent-labelled scale: ylim() would swap in a plain
# y scale and drop the percent labels set by the plotting function.
(p <- plot_INTROD_for_team(data, RED) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.025)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Similar conclusion for Red.

Team Green - test vs. non-test code

# Limits are set on a percent-labelled scale: ylim() would swap in a plain
# y scale and drop the percent labels set by the plotting function.
(p <- plot_nontest_INTROD_for_team(data, GREEN) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.05)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Limits are set on a percent-labelled scale: ylim() would swap in a plain
# y scale and drop the percent labels set by the plotting function.
(p <- plot_test_INTROD_for_team(data, GREEN) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.05)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_bar()`).

There is a small number of zeros in the Venus repository, visible in the plot as well (the other zeros are excluded by the y-axis limit).

# Limits are set on a percent-labelled scale: ylim() would swap in a plain
# y scale and drop the percent labels set by the plotting function.
(p <- plot_INTROD_for_team(data, GREEN) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.05)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).

IntTest and Architects

Is it true that Architects rarely introduce new files in the Integration Tests?

# Count of new vs. existing files per committing team where repo == INTTEST,
# with each count's share of the team's total.
#
# The shares are computed in mutate(), after summarize() has dropped the ISNEW
# grouping level, so sum(count) is the team total. Computing the share inside
# summarize() divides each per-group count by itself and always yields 100.
data |>
  filter(repo == INTTEST) |>
  group_by(committerteam, ISNEW) |>
  summarize(count = n(), .groups = "drop_last") |>
  mutate(
    pct = 100 * count / sum(count),
    ratio = round(pct, 1)
  )
## # A tibble: 22 × 5
## # Groups:   committerteam [11]
##    committerteam ISNEW count   pct ratio
##    <fct>         <lgl> <int> <dbl> <dbl>
##  1 Arch          FALSE   227   100  94.2
##  2 Arch          TRUE     14   100   5.8
##  3 Blue          FALSE   937   100  91  
##  4 Blue          TRUE     93   100   9  
##  5 Brown         FALSE   308   100  92.8
##  6 Brown         TRUE     24   100   7.2
##  7 Green         FALSE   713   100  98.1
##  8 Green         TRUE     14   100   1.9
##  9 Orange        FALSE   269   100  95.4
## 10 Orange        TRUE     13   100   4.6
## # ℹ 12 more rows

Not really… 14 new files out of 241 (5.8%), whereas Green and Orange also have 14 and 13, respectively, corresponding to 1.9% and 4.6%. So, no conclusive evidence that other teams introduce more integration test files than the Architects.

# Histogram of DEL (bins of width 30) for rows where repo == JUPITER, one facet
# per committing team, with a pseudo-log y axis.
(p <- data |>
  filter(repo == JUPITER) |>
  group_by(committerteam, DEL) |>
  ggplot(aes(x = DEL, fill = committerteam)) +
  geom_histogram(binwidth = 30) +
  facet_wrap(~ committerteam) +
  scale_y_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 5000)
  ) +
  ylab("log10(count)") +
  scale_fill_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  ggtitle("Deleted lines per team", "Jupiter repository"))

Possibly more deletions done by Architects, however. But Red and Yellow also seem to delete a lot of lines.

Additions versus deletions

# ADD against DEL on pseudo-log axes for two teams in two repositories,
# one facet per repo, point size taken from logCOMPLEX.
(p <- data |>
  mutate(team = committerteam) |>
  filter(repo %in% c(JUPITER, URANUS), team %in% c(RED, BLUE)) |>
  ggplot(aes(y = ADD, x = DEL, size = logCOMPLEX, colour = team, shape = team)) +
  geom_point() +
  scale_x_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  scale_y_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  # Semi-transparent colours (alpha 40 of 255) so overlapping points stay visible.
  scale_color_manual(values = c(
    Blue = rgb(31, 120, 180, 40, maxColorValue = 255),
    Red = rgb(227, 26, 28, 40, maxColorValue = 255)
  )) +
  theme_bw() +
  facet_wrap(~repo) +
  ggtitle("Added and changed lines per team and repo, sized by complexity of the file") +
  theme(legend.position = "bottom"))

Comparing how Red and Blue perform additions and deletions reveals that Red is more active in the Jupiter repo, and Blue in Uranus. Pure additions correspond to the \(y\)-axis, and pure deletions to the \(x\)-axis. Line changes correspond to the \(y=x\) line, which is clearly visible in both plots.

# ADD against DEL on pseudo-log axes for three teams in three repositories,
# one facet per repo, point size taken from logCOMPLEX.
(p <- data |>
  filter(
    repo %in% c(JUPITER, URANUS, MARS),
    committerteam %in% c(RED, BLUE, GREEN)
  ) |>
  ggplot(aes(
    y = ADD, x = DEL, size = logCOMPLEX,
    colour = committerteam, shape = committerteam
  )) +
  geom_point() +
  scale_x_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  scale_y_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  # Semi-transparent colours (alpha 40 of 255) so overlapping points stay visible.
  scale_color_manual(values = c(
    Blue = rgb(31, 120, 180, 40, maxColorValue = 255),
    Red = rgb(227, 26, 28, 40, maxColorValue = 255),
    Green = rgb(51, 160, 44, 40, maxColorValue = 255)
  )) +
  scale_alpha(0.1) +
  theme_bw() +
  facet_wrap(~repo) +
  ggtitle("Added and changed lines per team and repo, sized by complexity of the file"))

Adding the Green team and the Mars repository shows that Green is active in Mars, but also in Uranus.

Hard to draw any certain conclusions from these plots, however.

Clone introductions versus added lines

# INTROD against ADD (pseudo-log x axis) for selected teams, one repository per
# plot; point size is taken from COMPLEX.
data |>
  filter(repo == MARS, committerteam %in% c(BLUE, GREEN)) |>
  ggplot(aes(y = INTROD, x = ADD, colour = committerteam, size = COMPLEX)) +
  geom_point() +
  scale_x_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  scale_color_manual(values = c(
    Blue = rgb(31, 120, 180, 75, maxColorValue = 255),
    Red = rgb(227, 26, 28, 40, maxColorValue = 255),
    Green = rgb(51, 160, 44, 75, maxColorValue = 255)
  )) +
  theme_bw() +
  ggtitle("Introduced duplicates per added lines, sized by complexity of the file", "Mars repository")

data |>
  filter(repo == JUPITER, committerteam %in% c(BLUE, RED)) |>
  ggplot(aes(y = INTROD, x = ADD, colour = committerteam, size = COMPLEX)) +
  geom_point() +
  scale_x_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  scale_color_manual(values = c(
    Blue = rgb(31, 120, 180, 75, maxColorValue = 255),
    Red = rgb(227, 26, 28, 40, maxColorValue = 255),
    Green = rgb(51, 160, 44, 75, maxColorValue = 255)
  )) +
  theme_bw() +
  ggtitle("Introduced duplicates per added lines, sized by complexity of the file", "Jupiter repository")

data |>
  filter(repo == INTTEST, committerteam %in% c(BLUE, RED)) |>
  ggplot(aes(y = INTROD, x = ADD, colour = committerteam, size = COMPLEX)) +
  geom_point() +
  scale_x_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  scale_color_manual(values = c(
    Blue = rgb(31, 120, 180, 75, maxColorValue = 255),
    Red = rgb(227, 26, 28, 40, maxColorValue = 255),
    Green = rgb(51, 160, 44, 75, maxColorValue = 255)
  )) +
  theme_bw() +
  ggtitle("Introduced duplicates per added lines, sized by complexity of the file", "Int.test repository")

The tendency to introduce clones increases with the number of added lines, as expected.